Background

We want to follow-up a number of driver SNPs for our Myositis-relevant components in the IMD basis (PC1-3, 8, 9, 12, and 13). To do so, we retrieved eQTL summary statistics data from eQTL catalogue for the driver SNPs for these components. We further filtered those SNPs to include only those included in credible sets performed by eQTL catalogue using SuSIE, to be more sure that the SNPs are potentially causal. See below for more details on implementation.

First we load libraries and the list of SNPs

library(data.table)
library(readr)
library(seqminer)
library(magrittr)
library(ggplot2)
library(ggrepel)
library(cowplot)
library(gprofiler2)
# The following packages are Bioconductor's so they need to be installed differently
library(GenomicRanges)
library(biomaRt)
library(STRINGdb)
library(VennDiagram)
library(RColorBrewer)

Load basic data and tissue selection

Here we’ll use data from the eQTLcatalogue, which we’ll additionally filter by information present in the credible sets computed by eQTL catalogue. This will help us tell which SNPs are truly causal eQTL for the genes and hence which eSNP-eGene are real (Note: Since latest update [June 2021] GTEx actually corresponds to GTEx v8, so GTEx_v8 is now redundant). Since I didn’t have a manifest for the credible paths (only URLs) I had to make some modifications in some names for both datasets to fit nicely. This involved some manual editing from the credible sets URLs, which resulted in credible_paths_processed.tsv.

We performed these steps for cell basis v3 (varimax) before, so some of the files are available in the corresponding directory.

# Let's import the relevant files, for report rendering.
snp <- fread("../data/Myositis_IMD_driver_SNPs.tsv")
setnames(snp, "SNPID", "SNPID.basis")

ecat <- fread("../data/Combined_eQTL_credible_paths.tsv") # This dataset was copied from blood cell basis v3 repository, since the sources are the same

Now let’s take a look at the available tissues

table(ecat$tissue_label)
## 
##                           adipose                adipose (visceral) 
##                                12                                 4 
##                     adrenal gland                    artery (aorta) 
##                                 4                                 4 
##                 artery (coronary)                   artery (tibial) 
##                                 4                                 4 
##                            B cell                             blood 
##                                 6                                12 
##                  brain (amygdala) brain (anterior cingulate cortex) 
##                                 4                                 4 
##                   brain (caudate)     brain (cerebellar hemisphere) 
##                                 4                                 4 
##                brain (cerebellum)                    brain (cortex) 
##                                 4                                 4 
##                     brain (DLPFC)               brain (hippocampus) 
##                                16                                 4 
##              brain (hypothalamus)         brain (nucleus accumbens) 
##                                 4                                 4 
##                   brain (putamen)               brain (spinal cord) 
##                                 8                                 4 
##          brain (substantia nigra)          brain (substantia_nigra) 
##                                 4                                 4 
##                            breast                    CD16+ monocyte 
##                                 4                                 4 
##                       CD4+ T cell                       CD8+ T cell 
##                                14                                10 
##                   esophagus (gej)                esophagus (mucosa) 
##                                 4                                 4 
##            esophagus (muscularis)                        fibroblast 
##                                 4                                 8 
##          heart (atrial appendage)            heart (left ventricle) 
##                                 4                                 4 
##                        hepatocyte              high grade cartilage 
##                                 4                                 4 
##                             ileum                              iPSC 
##                                 1                                12 
##                   kidney (cortex)                               LCL 
##                                 4                                24 
##                             liver               low grade cartilage 
##                                 4                                 4 
##                              lung                        macrophage 
##                                 4                                28 
##                         microglia              minor salivary gland 
##                                 4                                 4 
##                          monocyte                            muscle 
##                                33                                 8 
##                        neutrophil                           NK cell 
##                                 6                                 4 
##                             ovary                          pancreas 
##                                 4                                 4 
##                  pancreatic islet                         pituitary 
##                                 4                                 4 
##                          placenta                          platelet 
##                                 4                                 1 
##                          prostate                            rectum 
##                                 4                                 1 
##                    sensory neuron                     sigmoid colon 
##                                 4                                 4 
##                              skin                 skin (suprapubic) 
##                                 8                                 4 
##                   small intestine                            spleen 
##                                 4                                 4 
##                           stomach                          synovium 
##                                 4                                 4 
##                            T cell                            testis 
##                                 4                                 4 
##                          Tfh cell                          Th1 cell 
##                                 4                                 4 
##                       Th1-17 cell                         Th17 cell 
##                                 4                                 4 
##                          Th2 cell                           thyroid 
##                                 4                                 4 
##                      tibial nerve                  transverse colon 
##                                 4                                 5 
##                       Treg memory                        Treg naive 
##                                 4                                 4 
##                            uterus                            vagina 
##                                 4                                 4

There are some interesting tissues to explore, particularly blood and immune cells, so we’ll select them. We’ll also select gene-expression and microarray datasets (since there are some interesting tissues, such as platelets, for which only microarray data is available), and “naive” condition.

tissues <- c("B cell", "CD16+ monocyte", "CD4+ T cell", "CD8+ T cell", "macrophage", "monocyte", "neutrophil", "NK cell", "platelet", "T cell", "Tfh cell", "Th1 cell", "Th1-17 cell", "Th17 cell", "Th2 cell", "Treg memory", "Treg naive")
ecat.filt <- ecat[tissue_label %in% tissues & (quant_method == "ge" | quant_method == "microarray") & condition_label == "naive",]

At this point, we need to load the data. These files are reasonably big and their processing would take a while, so I decided to

  • Download them all to the HPC (~/rds/rds-cew54-wallace-share/Data/expr/eqtl-catalogue),
  • Filter them by our driver SNPs,
  • Filter SNPs by those present in credible sets only,
  • Retrieve extra information on associated genes from BiomaRt.

See eqtl_driverSNPs_processing.R to get a taste of what I did.

Let’s load the full resulting dataset.

causal <- fread("../data/Causal_eQTLs_full_20210915.tsv") # File copied from IMD basis repository, as it contains the required information

# Noticed a bug in the procesing of snp_study and study_tissue. Let's briefly fix it
causal[, snp_study:=gsub(pattern = "(_[A-Za-z0-9\\+ -]+)$", "", snp_study, perl = TRUE)][, study_tissue:=gsub(pattern = "(_[A-Za-z0-9\\+ -]+)$", "", study_tissue, perl = TRUE)][,study:=gsub(pattern = "(_[A-Za-z0-9\\+ -]+)$", "", study, perl = TRUE)][,quant_method:=gsub("([A-Za-z0-9_]+)_([a-z]+)", "\\2", study, perl = TRUE)][,study:=gsub(pattern = "(_[a-z]+)$", "", study, perl = TRUE)]

Data vizzzzz

Principal components

We’ll focus on the different driver SNPs component by component.

PC1

Driver SNPs and cell types

We’ll take a look at how different cell types are influenced by driver SNPs at PC1.

snppc1 <- snp[rot1 != 0]
pc1 <- causal[rot1 != 0]
pc1[, SNPID.basis:=factor(SNPID.basis)][, SNPID.basis:= reorder(SNPID.basis, rot1)]

snppc1[, eqtl:=ifelse(SNPID.basis %in% pc1$SNPID.basis, "yes", "no")]

pc1dp <- ggplot(snppc1, aes(x = reorder(SNPID.basis, rot1), y = rot1, colour = eqtl, label=SNPID.basis))+
          geom_point()+
          scale_colour_manual(values = c("yes" = "red", "no" = "black"))+
          geom_hline(yintercept = 0, col="red", lty=1)+
          geom_label_repel(size = 3, force = 2, box.padding = .3, seed = 2, direction = "both", max.overlaps = 45, data = snppc1[eqtl == "yes"])+
          ylab("Driver SNP Rotation")+
          ggtitle("PC1 driver SNPs")+
          theme_cowplot(12)+
          theme(axis.text.x = element_text(angle = 90, hjust = 1), axis.title.x = element_blank(), axis.text.x.bottom = element_blank())
pc1dp

length(unique(pc1$SNPID.basis))
## [1] 15
length(unique(pc1$tissue_label))
## [1] 11
length(unique(pc1$SNPID.basis))/nrow(snppc1)
## [1] 0.1401869

For PC1 we have 15 driver SNPs that are identified as eQTL across 11 tissues.

tcells <- c("CD4+ T cell", "CD8+ T cell", "T cell", "Tfh cell", "Th1 cell", "Th1-17 cell", "Th17 cell", "Th2 cell", "Treg memory", "Treg naive")
otcells <- c("B cell", "CD16+ monocyte", "macrophage", "monocyte", "neutrophil", "NK cell", "platelet")

plot.cells.eqtl <- function(x){
  tcells <- c("CD4+ T cell", "CD8+ T cell", "T cell", "Tfh cell", "Th1 cell", "Th1-17 cell", "Th17 cell", "Th2 cell", "Treg memory", "Treg naive")
  otcells <- c("B cell", "CD16+ monocyte", "macrophage", "monocyte", "neutrophil", "NK cell", "platelet")
  
  ptc <- x[tissue_label %in% tcells]
  poc <- x[tissue_label %in% otcells]
  
  cp1 <- ggplot(ptc, aes(x = SNPID.basis, y = beta, ymin=beta-se, ymax=beta+se, colour = gene_name))+
  geom_pointrange()+
  geom_hline(yintercept = 0, col="red", lty=2)+
  ylab("Beta")+
  scale_x_discrete(drop=FALSE)+
  ylim(c(min(x$beta), max(x$beta)))+
 # ggtitle("PC1 driver SNPs effect on gene expression")+
  facet_grid(tissue_label~.,  scales = "free", space = "free", switch = "y")+
  theme_bw(base_size = 18)+
  theme(legend.position = "none", strip.text.y.left = element_text(angle = 0), axis.title.y = element_blank(), axis.text.x = element_text(angle = 90))
  
  cp2 <- ggplot(poc, aes(x = SNPID.basis, y = beta, ymin=beta-se, ymax=beta+se, colour = gene_name))+
  geom_pointrange()+
  geom_hline(yintercept = 0, col="red", lty=2)+
  ylab("Beta")+
  scale_x_discrete(drop=FALSE)+
  ylim(c(min(x$beta), max(x$beta)))+
 # ggtitle("PC1 driver SNPs effect on gene expression")+
  facet_grid(tissue_label~.,  scales = "free", space = "free", switch = "y")+
  theme_bw(base_size = 18)+
  theme(legend.position = "bottom", strip.text.y.left = element_text(angle = 0), axis.title.y = element_blank(), axis.text.x = element_text(angle = 90))
 
  gridExtra::grid.arrange(cp1, cp2, nrow = 1) 
  
}

pce1 <- plot.cells.eqtl(pc1)
## Warning: Removed 6 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

PC2

Driver SNPs and cell types

snppc2 <- snp[rot2 != 0]
pc2 <- causal[rot2 != 0]
pc2[, SNPID.basis:=factor(SNPID.basis)][, SNPID.basis:= reorder(SNPID.basis, rot2)]

snppc2[, eqtl:=ifelse(SNPID.basis %in% pc2$SNPID.basis, "yes", "no")]

pc2dp <- ggplot(snppc2, aes(x = reorder(SNPID.basis, rot2), y = rot2, colour = eqtl, label=SNPID.basis))+
          geom_point()+
          scale_colour_manual(values = c("yes" = "red", "no" = "black"))+
          geom_hline(yintercept = 0, col="red", lty=1)+
          geom_label_repel(size = 3, force = 2, box.padding = .3, seed = 2, direction = "both", max.overlaps = 45, data = snppc2[eqtl == "yes"])+
          ylab("Driver SNP Rotation")+
          ggtitle("PC2 driver SNPs")+
          theme_cowplot(12)+
          theme(axis.text.x = element_text(angle = 90, hjust = 1), axis.title.x = element_blank(), axis.text.x.bottom = element_blank())
pc2dp

length(unique(pc2$SNPID.basis))
## [1] 24
length(unique(pc2$tissue_label))
## [1] 17
length(unique(pc2$SNPID.basis))/nrow(snppc2)
## [1] 0.15
plot.cells.eqtl(pc2)
## Warning: Removed 7 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

PC3

Driver SNPs and cell types

snppc3 <- snp[rot3 != 0]
pc3 <- causal[rot3 != 0]
pc3[, SNPID.basis:=factor(SNPID.basis)][, SNPID.basis:= reorder(SNPID.basis, rot3)]

snppc3[, eqtl:=ifelse(SNPID.basis %in% pc3$SNPID.basis, "yes", "no")]

pc3dp <- ggplot(snppc3, aes(x = reorder(SNPID.basis, rot3), y = rot3, colour = eqtl, label=SNPID.basis))+
          geom_point()+
          scale_colour_manual(values = c("yes" = "red", "no" = "black"))+
          geom_hline(yintercept = 0, col="red", lty=1)+
          geom_label_repel(size = 3, force = 2, box.padding = .3, seed = 2, direction = "both", max.overlaps = 45, data = snppc3[eqtl == "yes"])+
          ylab("Driver SNP Rotation")+
          ggtitle("PC3 driver SNPs")+
          theme_cowplot(12)+
          theme(axis.text.x = element_text(angle = 90, hjust = 1), axis.title.x = element_blank(), axis.text.x.bottom = element_blank())
pc3dp

length(unique(pc3$SNPID.basis))
## [1] 35
length(unique(pc3$tissue_label))
## [1] 17
length(unique(pc3$SNPID.basis))/nrow(snppc3)
## [1] 0.21875
plot.cells.eqtl(pc3)
## Warning: Removed 9 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 2 rows containing missing values (geom_segment).

PC8

Driver SNPs and cell types

snppc8 <- snp[rot8 != 0]
pc8 <- causal[rot8 != 0]
pc8[, SNPID.basis:=factor(SNPID.basis)][, SNPID.basis:= reorder(SNPID.basis, rot8)]

snppc8[, eqtl:=ifelse(SNPID.basis %in% pc8$SNPID.basis, "yes", "no")]

pc8dp <- ggplot(snppc8, aes(x = reorder(SNPID.basis, rot8), y = rot8, colour = eqtl, label=SNPID.basis))+
          geom_point()+
          scale_colour_manual(values = c("yes" = "red", "no" = "black"))+
          geom_hline(yintercept = 0, col="red", lty=1)+
          geom_label_repel(size = 3, force = 2, box.padding = .3, seed = 2, direction = "both", max.overlaps = 45, data = snppc8[eqtl == "yes"])+
          ylab("Driver SNP Rotation")+
          ggtitle("PC8 driver SNPs")+
          theme_cowplot(12)+
          theme(axis.text.x = element_text(angle = 90, hjust = 1), axis.title.x = element_blank(), axis.text.x.bottom = element_blank())
pc8dp

length(unique(pc8$SNPID.basis))
## [1] 26
length(unique(pc8$tissue_label))
## [1] 14
length(unique(pc8$SNPID.basis))/nrow(snppc8)
## [1] 0.1954887
plot.cells.eqtl(pc8)
## Warning: Removed 5 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

PC9

Driver SNPs and cell types

snppc9 <- snp[rot9 != 0]
pc9 <- causal[rot9 != 0]
pc9[, SNPID.basis:=factor(SNPID.basis)][, SNPID.basis:= reorder(SNPID.basis, rot9)]

snppc9[, eqtl:=ifelse(SNPID.basis %in% pc9$SNPID.basis, "yes", "no")]

pc9dp <- ggplot(snppc9, aes(x = reorder(SNPID.basis, rot9), y = rot9, colour = eqtl, label=SNPID.basis))+
          geom_point()+
          scale_colour_manual(values = c("yes" = "red", "no" = "black"))+
          geom_hline(yintercept = 0, col="red", lty=1)+
          geom_label_repel(size = 3, force = 2, box.padding = .3, seed = 2, direction = "both", max.overlaps = 45, data = snppc9[eqtl == "yes"])+
          ylab("Driver SNP Rotation")+
          ggtitle("PC9 driver SNPs")+
          theme_cowplot(12)+
          theme(axis.text.x = element_text(angle = 90, hjust = 1), axis.title.x = element_blank(), axis.text.x.bottom = element_blank())
pc9dp

length(unique(pc9$SNPID.basis))
## [1] 54
length(unique(pc9$tissue_label))
## [1] 17
length(unique(pc9$SNPID.basis))/nrow(snppc9)
## [1] 0.225
plot.cells.eqtl(pc9)
## Warning: Removed 17 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

PC12

Driver SNPs and cell types

snppc12 <- snp[rot12 != 0]
pc12 <- causal[rot12 != 0]
pc12[, SNPID.basis:=factor(SNPID.basis)][, SNPID.basis:= reorder(SNPID.basis, rot12)]

snppc12[, eqtl:=ifelse(SNPID.basis %in% pc12$SNPID.basis, "yes", "no")]

pc12dp <- ggplot(snppc12, aes(x = reorder(SNPID.basis, rot12), y = rot12, colour = eqtl, label=SNPID.basis))+
          geom_point()+
          scale_colour_manual(values = c("yes" = "red", "no" = "black"))+
          geom_hline(yintercept = 0, col="red", lty=1)+
          geom_label_repel(size = 3, force = 2, box.padding = .3, seed = 2, direction = "both", max.overlaps = 45, data = snppc12[eqtl == "yes"])+
          ylab("Driver SNP Rotation")+
          ggtitle("PC12 driver SNPs")+
          theme_cowplot(12)+
          theme(axis.text.x = element_text(angle = 90, hjust = 1), axis.title.x = element_blank(), axis.text.x.bottom = element_blank())
pc12dp

length(unique(pc12$SNPID.basis))
## [1] 51
length(unique(pc12$tissue_label))
## [1] 17
length(unique(pc12$SNPID.basis))/nrow(snppc12)
## [1] 0.1917293
plot.cells.eqtl(pc12)
## Warning: Removed 16 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

PC13

Driver SNPs and cell types

snppc13 <- snp[rot13 != 0]
pc13 <- causal[rot13 != 0]
pc13[, SNPID.basis:=factor(SNPID.basis)][, SNPID.basis:= reorder(SNPID.basis, rot13)]

snppc13[, eqtl:=ifelse(SNPID.basis %in% pc13$SNPID.basis, "yes", "no")]

pc13dp <- ggplot(snppc13, aes(x = reorder(SNPID.basis, rot13), y = rot13, colour = eqtl, label=SNPID.basis))+
          geom_point()+
          scale_colour_manual(values = c("yes" = "red", "no" = "black"))+
          geom_hline(yintercept = 0, col="red", lty=1)+
          geom_label_repel(size = 3, force = 2, box.padding = .3, seed = 2, direction = "both", max.overlaps = 45, data = snppc13[eqtl == "yes"])+
          ylab("Driver SNP Rotation")+
          ggtitle("PC13 driver SNPs")+
          theme_cowplot(12)+
          theme(axis.text.x = element_text(angle = 90, hjust = 1), axis.title.x = element_blank(), axis.text.x.bottom = element_blank())
pc13dp

length(unique(pc13$SNPID.basis))
## [1] 80
length(unique(pc13$tissue_label))
## [1] 17
length(unique(pc13$SNPID.basis))/nrow(snppc13)
## [1] 0.2144772
plot.cells.eqtl(pc13)
## Warning: Removed 22 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_segment).

–>

–> –> –> –> –>

–>

–>

–>

–> –> –> –>

–> –> –> –> –>

–>

–>

–> –>

–>

–> –> –>

–>

–>

–>

–>

–> –> –> –> –> –>

–>

–> –> –> –>

–>

–> –> –> –>

–>

–> –> –> –>

–>

–> –> –> –>